{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\", \"books\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# %load getdata.py\n",
    "# Downloads the books and stores them in the below folder\n",
    "import os\n",
    "from time import sleep\n",
    "import urllib.request\n",
    "\n",
    "titles = {}\n",
    "\n",
    "\n",
    "titles['burton'] = [4657, 2400, 5760, 6036, 7111, 8821,\n",
    "                    18506, 4658, 5761, 6886, 7113]\n",
    "titles['dickens'] = [24022, 1392, 1414, 1467, 2324, 580,\n",
    "                     786, 888, 963, 27924, 1394, 1415, 15618,\n",
    "                     25985, 588, 807, 914, 967, 30127, 1400,\n",
    "                     1421, 16023, 28198, 644, 809, 917, 968, 1023,\n",
    "                     1406, 1422, 17879, 30368, 675, 810, 924, 98,\n",
    "                     1289, 1413, 1423, 17880, 32241, 699, 821, 927]\n",
    "titles['doyle'] = [2349, 11656, 1644, 22357, 2347, 290, 34627, 5148,\n",
    "                   8394, 26153, 12555, 1661, 23059, 2348, 294, 355,\n",
    "                   5260, 8727, 10446, 126, 17398, 2343, 2350, 3070,\n",
    "                   356, 5317, 903, 10581, 13152, 2038, 2344, 244, 32536,\n",
    "                   423, 537, 108, 139, 2097, 2345, 24951, 32777, 4295,\n",
    "                   7964, 11413, 1638, 21768, 2346, 2845, 3289, 439, 834]\n",
    "titles['gaboriau'] = [1748, 1651, 2736, 3336, 4604, 4002, 2451,\n",
    "                      305, 3802, 547]\n",
    "titles['nesbit'] = [34219, 23661, 28804, 4378, 778, 20404, 28725,\n",
    "                    33028, 4513, 794]\n",
    "titles['tarkington'] = [1098, 15855, 1983, 297, 402, 5798,\n",
    "                        8740, 980, 1158, 1611, 2326, 30092,\n",
    "                        483, 5949, 8867, 13275, 18259, 2595,\n",
    "                        3428, 5756, 6401, 9659]\n",
    "titles['twain'] = [1044, 1213, 245, 30092, 3176, 3179, 3183, 3189, 74,\n",
    "                   86, 1086, 142, 2572, 3173, 3177, 3180, 3186, 3192,\n",
    "                   76, 91, 119, 1837, 2895, 3174, 3178, 3181, 3187, 3432,\n",
    "                   8525]\n",
    "\n",
    "\n",
    "\n",
    "assert len(titles) == 7\n",
    "\n",
    "assert len(titles['tarkington']) == 22\n",
    "assert len(titles['dickens']) == 44\n",
    "assert len(titles['nesbit']) == 10\n",
    "assert len(titles['doyle']) == 51\n",
    "assert len(titles['twain']) == 29\n",
    "assert len(titles['burton']) == 11\n",
    "assert len(titles['gaboriau']) == 10\n",
    "\n",
    "\n",
    "# https://www.gutenberg.org\n",
    "#url_base = \"http://gutenberg.pglaf.org/\"\n",
    "#url_base = \"http://gutenberg.readingroo.ms/\"\n",
    "url_base = \"http://www.gutenberg.myebook.bg/\"\n",
    "url_format = \"{url_base}{idstring}/{id}/{id}.txt\"\n",
    "\n",
    "fixes = {}\n",
    "fixes[1044] = url_base + \"1/0/4/1044/1044-0.txt\"\n",
    "fixes[5148] = url_base + \"5/1/4/5148/5148-0.txt\"\n",
    "fixes[4657] = \"https://archive.org/stream/personalnarrativ04657gut/pnpa110.txt\"\n",
    "fixes[1467] = \"https://archive.org/stream/somechristmassto01467gut/cdscs10p_djvu.txt\"\n",
    "\n",
    "# Make parent folder if not exists\n",
    "if not os.path.exists(data_folder):\n",
    "    os.makedirs(data_folder)\n",
    "\n",
    "for author in titles:\n",
    "    print(\"Downloading titles from {author}\".format(author=author))\n",
    "    # Make author's folder if not exists\n",
    "    author_folder = os.path.join(data_folder, author)\n",
    "    if not os.path.exists(author_folder):\n",
    "        os.makedirs(author_folder)\n",
    "    # Download each title to this folder\n",
    "    for bookid in titles[author]:\n",
    "        if bookid in fixes:\n",
    "            print(\" - Applying fix to book with id {id}\".format(id=bookid))\n",
    "            url =  fixes[bookid]\n",
    "        else:\n",
    "            print(\" - Getting book with id {id}\".format(id=bookid))\n",
    "            idstring = \"/\".join([str(bookid)[i] for i in range(len(str(bookid))-1)])\n",
    "            print(bookid, idstring)\n",
    "            url = url_format.format(url_base=url_base, idstring=idstring, id=bookid)\n",
    "        print(\" - \" + url)\n",
    "        filename = os.path.join(author_folder, \"{id}.txt\".format(id=bookid))\n",
    "        if os.path.exists(filename):\n",
    "            print(\" - File already exists, skipping\")\n",
    "        else:\n",
    "            urllib.request.urlretrieve(url, filename)\n",
    "            sleep(60*5)\n",
    "print(\"Download complete\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def clean_book(document):\n",
    "    lines = document.split(\"\\n\")\n",
    "    start= 0\n",
    "    end = len(lines)\n",
    "    for i in range(len(lines)):\n",
    "        line = lines[i]\n",
    "        if line.startswith(\"*** START OF THIS PROJECT GUTENBERG\"):\n",
    "            start = i + 1\n",
    "        elif line.startswith(\"*** END OF THIS PROJECT GUTENBERG\"):\n",
    "            end = i - 1\n",
    "    return \"\\n\".join(lines[start:end])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def load_books_data(folder=data_folder):\n",
    "    documents = []\n",
    "    authors = []\n",
    "    subfolders = [subfolder for subfolder in os.listdir(folder)\n",
    "                  if os.path.isdir(os.path.join(folder, subfolder))]\n",
    "    for author_number, subfolder in enumerate(subfolders):\n",
    "        full_subfolder_path = os.path.join(folder, subfolder)\n",
    "        for document_name in os.listdir(full_subfolder_path):\n",
    "            with open(os.path.join(full_subfolder_path, document_name)) as inf:\n",
    "                documents.append(clean_book(inf.read()))\n",
    "                authors.append(author_number)\n",
    "    return documents, np.array(authors, dtype='int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "documents, classes = load_books_data(data_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "function_words = [\"a\", \"able\", \"aboard\", \"about\", \"above\", \"absent\",\n",
    "                  \"according\" , \"accordingly\", \"across\", \"after\", \"against\",\n",
    "                  \"ahead\", \"albeit\", \"all\", \"along\", \"alongside\", \"although\",\n",
    "                  \"am\", \"amid\", \"amidst\", \"among\", \"amongst\", \"amount\", \"an\",\n",
    "                    \"and\", \"another\", \"anti\", \"any\", \"anybody\", \"anyone\",\n",
    "                    \"anything\", \"are\", \"around\", \"as\", \"aside\", \"astraddle\",\n",
    "                    \"astride\", \"at\", \"away\", \"bar\", \"barring\", \"be\", \"because\",\n",
    "                    \"been\", \"before\", \"behind\", \"being\", \"below\", \"beneath\",\n",
    "                    \"beside\", \"besides\", \"better\", \"between\", \"beyond\", \"bit\",\n",
    "                    \"both\", \"but\", \"by\", \"can\", \"certain\", \"circa\", \"close\",\n",
    "                    \"concerning\", \"consequently\", \"considering\", \"could\",\n",
    "                    \"couple\", \"dare\", \"deal\", \"despite\", \"down\", \"due\", \"during\",\n",
    "                    \"each\", \"eight\", \"eighth\", \"either\", \"enough\", \"every\",\n",
    "                    \"everybody\", \"everyone\", \"everything\", \"except\", \"excepting\",\n",
    "                    \"excluding\", \"failing\", \"few\", \"fewer\", \"fifth\", \"first\",\n",
    "                    \"five\", \"following\", \"for\", \"four\", \"fourth\", \"from\", \"front\",\n",
    "                    \"given\", \"good\", \"great\", \"had\", \"half\", \"have\", \"he\",\n",
    "                    \"heaps\", \"hence\", \"her\", \"hers\", \"herself\", \"him\", \"himself\",\n",
    "                    \"his\", \"however\", \"i\", \"if\", \"in\", \"including\", \"inside\",\n",
    "                    \"instead\", \"into\", \"is\", \"it\", \"its\", \"itself\", \"keeping\",\n",
    "                    \"lack\", \"less\", \"like\", \"little\", \"loads\", \"lots\", \"majority\",\n",
    "                    \"many\", \"masses\", \"may\", \"me\", \"might\", \"mine\", \"minority\",\n",
    "                    \"minus\", \"more\", \"most\", \"much\", \"must\", \"my\", \"myself\",\n",
    "                    \"near\", \"need\", \"neither\", \"nevertheless\", \"next\", \"nine\",\n",
    "                    \"ninth\", \"no\", \"nobody\", \"none\", \"nor\", \"nothing\",\n",
    "                    \"notwithstanding\", \"number\", \"numbers\", \"of\", \"off\", \"on\",\n",
    "                    \"once\", \"one\", \"onto\", \"opposite\", \"or\", \"other\", \"ought\",\n",
    "                    \"our\", \"ours\", \"ourselves\", \"out\", \"outside\", \"over\", \"part\",\n",
    "                    \"past\", \"pending\", \"per\", \"pertaining\", \"place\", \"plenty\",\n",
    "                    \"plethora\", \"plus\", \"quantities\", \"quantity\", \"quarter\",\n",
    "                    \"regarding\", \"remainder\", \"respecting\", \"rest\", \"round\",\n",
    "                    \"save\", \"saving\", \"second\", \"seven\", \"seventh\", \"several\",\n",
    "                    \"shall\", \"she\", \"should\", \"similar\", \"since\", \"six\", \"sixth\",\n",
    "                    \"so\", \"some\", \"somebody\", \"someone\", \"something\", \"spite\",\n",
    "                    \"such\", \"ten\", \"tenth\", \"than\", \"thanks\", \"that\", \"the\",\n",
    "                    \"their\", \"theirs\", \"them\", \"themselves\", \"then\", \"thence\",\n",
    "                  \"therefore\", \"these\", \"they\", \"third\", \"this\", \"those\",\n",
    "\"though\", \"three\", \"through\", \"throughout\", \"thru\", \"thus\",\n",
    "\"till\", \"time\", \"to\", \"tons\", \"top\", \"toward\", \"towards\",\n",
    "\"two\", \"under\", \"underneath\", \"unless\", \"unlike\", \"until\",\n",
    "\"unto\", \"up\", \"upon\", \"us\", \"used\", \"various\", \"versus\",\n",
    "\"via\", \"view\", \"wanting\", \"was\", \"we\", \"were\", \"what\",\n",
    "\"whatever\", \"when\", \"whenever\", \"where\", \"whereas\",\n",
    "\"wherever\", \"whether\", \"which\", \"whichever\", \"while\",\n",
    "                  \"whilst\", \"who\", \"whoever\", \"whole\", \"whom\", \"whomever\",\n",
    "\"whose\", \"will\", \"with\", \"within\", \"without\", \"would\", \"yet\",\n",
    "\"you\", \"your\", \"yours\", \"yourself\", \"yourselves\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "extractor = CountVectorizer(vocabulary=function_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC\n",
    "from sklearn.cross_validation import cross_val_score\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn import grid_search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}\n",
    "svr = SVC()\n",
    "grid = grid_search.GridSearchCV(svr, parameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pipeline1 = Pipeline([('feature_extraction', extractor),\n",
    "                      ('clf', grid)\n",
    "                     ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "scores = cross_val_score(pipeline1, documents, classes,\n",
    "scoring='f1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(np.mean(scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pipeline = Pipeline([('feature_extraction',\n",
    "CountVectorizer(analyzer='char', ngram_range=(3, 3))),\n",
    "('classifier', grid)\n",
    "])\n",
    "scores = cross_val_score(pipeline, documents, classes,\n",
    "scoring='f1')\n",
    "print(\"Score: {:.3f}\".format(np.mean(scores)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "enron_data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\",\n",
    "\"enron_mail_20110402\", \"maildir\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from email.parser import Parser\n",
    "p = Parser()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.utils import check_random_state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_enron_corpus(num_authors=10, data_folder=data_folder,\n",
    "                     min_docs_author=10, max_docs_author=100,\n",
    "                     random_state=None):\n",
    "    random_state = check_random_state(random_state)\n",
    "    email_addresses = sorted(os.listdir(data_folder))\n",
    "    random_state.shuffle(email_addresses)\n",
    "    documents = []\n",
    "    classes = []\n",
    "    author_num = 0\n",
    "    authors = {}\n",
    "    for user in email_addresses:\n",
    "        users_email_folder = os.path.join(data_folder, user)\n",
    "        mail_folders = [os.path.join(users_email_folder, subfolder)\n",
    "                        for subfolder in os.listdir(users_email_folder)\n",
    "                        if \"sent\" in subfolder]\n",
    "        try:\n",
    "            authored_emails = [open(os.path.join(mail_folder, email_filename), encoding='cp1252').read()\n",
    "                               for mail_folder in mail_folders\n",
    "                               for email_filename in os.listdir(mail_folder)]\n",
    "        except IsADirectoryError:\n",
    "            continue\n",
    "        if len(authored_emails) < min_docs_author:\n",
    "            continue\n",
    "        if len(authored_emails) > max_docs_author:\n",
    "            authored_emails = authored_emails[:max_docs_author]\n",
    "        contents = [p.parsestr(email)._payload for email in authored_emails]\n",
    "        documents.extend(contents)\n",
    "        classes.extend([author_num] * len(authored_emails))\n",
    "        authors[user] = author_num\n",
    "        author_num += 1\n",
    "        if author_num >= num_authors or author_num >= len(email_addresses):\n",
    "            break\n",
    "    return documents, np.array(classes), authors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "documents, classes, authors = get_enron_corpus(data_folder=enron_data_folder, random_state=14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "documents[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import quotequail"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def remove_replies(email_contents):\n",
    "    r = quotequail.unwrap(email_contents)\n",
    "    if r is None:\n",
    "        return email_contents\n",
    "    if 'text_top' in r:\n",
    "        return r['text_top']\n",
    "    elif 'text' in r:\n",
    "        return r['text']\n",
    "    return email_contents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "documents = [remove_replies(document) for document in documents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "scores = cross_val_score(pipeline, documents, classes, scoring='f1')\n",
    "print(\"Score: {:.3f}\".format(np.mean(scores)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.cross_validation import train_test_split\n",
    "training_documents, testing_documents, y_train, y_test = train_test_split(documents, classes, random_state=14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pipeline.fit(training_documents, y_train)\n",
    "y_pred = pipeline.predict(testing_documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(pipeline.named_steps['classifier'].best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "cm = confusion_matrix(y_pred, y_test)\n",
    "cm = cm / cm.astype(np.float).sum(axis=1)\n",
    "sorted_authors = sorted(authors.keys(), key=lambda x:authors[x])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from matplotlib import pyplot as plt\n",
    "plt.figure(figsize=(30, 30))\n",
    "plt.imshow(cm, cmap='Blues')\n",
    "tick_marks = np.arange(len( sorted_authors ))\n",
    "plt.xticks(tick_marks, sorted_authors )\n",
    "plt.yticks(tick_marks, sorted_authors )\n",
    "plt.ylabel('Actual')\n",
    "plt.xlabel('Predicted')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
